# ! pip install plotly
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import plotly.graph_objs as go
import plotly.io as pio;
pio.renderers.default='notebook'
from plotly import tools
from plotly.subplots import make_subplots
import plotly.offline as py
py.init_notebook_mode()
from itertools import product
from sklearn.metrics import silhouette_score
# Load the customer records from the CSV file into a DataFrame.
customerDataSet = pd.read_csv("jewellery.csv")
# Preview the first rows of the dataset.
customerDataSet.head()
| Age | Income | SpendingScore | Savings | |
|---|---|---|---|---|
| 0 | 58 | 77769 | 0.791329 | 6559.829923 |
| 1 | 59 | 81799 | 0.791082 | 5417.661426 |
| 2 | 62 | 74751 | 0.702657 | 9258.992965 |
| 3 | 59 | 74373 | 0.765680 | 7346.334504 |
| 4 | 87 | 17760 | 0.348778 | 16869.507130 |
# Summary statistics for every column (include='all' does not restrict the summary to numeric columns).
customerDataSet.describe(include='all')
| Age | Income | SpendingScore | Savings | |
|---|---|---|---|---|
| count | 505.000000 | 505.000000 | 505.000000 | 505.000000 |
| mean | 59.019802 | 75513.291089 | 0.505083 | 11862.455867 |
| std | 24.140043 | 35992.922184 | 0.259634 | 4949.229253 |
| min | 17.000000 | 12000.000000 | 0.000000 | 0.000000 |
| 25% | 34.000000 | 34529.000000 | 0.304792 | 6828.709702 |
| 50% | 59.000000 | 75078.000000 | 0.368215 | 14209.932802 |
| 75% | 85.000000 | 107100.000000 | 0.768279 | 16047.268331 |
| max | 97.000000 | 142000.000000 | 1.000000 | 20000.000000 |
# Print column names, non-null counts, dtypes and memory usage.
customerDataSet.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 505 entries, 0 to 504 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 505 non-null int64 1 Income 505 non-null int64 2 SpendingScore 505 non-null float64 3 Savings 505 non-null float64 dtypes: float64(2), int64(2) memory usage: 15.9 KB
# Count the missing values in each column.
customerDataSet.isnull().sum()
Age 0 Income 0 SpendingScore 0 Savings 0 dtype: int64
As seen in above outputs:
There are 505 customer records in the dataset
There are no null values in any of the features
Age and Income are integer data whereas SpendingScore and Savings are float data.
The minimum value of both SpendingScore and Savings is 0, which may be erroneous and needs verification
# Inspect the records whose Savings or SpendingScore is exactly zero.
has_zero_savings = customerDataSet['Savings'] == 0
has_zero_spending = customerDataSet['SpendingScore'] == 0

zero_savings_counts = customerDataSet[has_zero_savings].value_counts()
zero_spending_counts = customerDataSet[has_zero_spending].value_counts()

print('\nSavings = 0:\n\n', zero_savings_counts, '\n')
print('---------------------------------------------------------\n')
print('SpendingScore = 0:\n\n', zero_spending_counts, '\n')
Savings = 0: Age Income SpendingScore Savings 17 134734 0.857922 0.0 1 dtype: int64 --------------------------------------------------------- SpendingScore = 0: Age Income SpendingScore Savings 86 124372 0.0 15374.465953 1 dtype: int64
Savings = 0
The age of this customer is 17
Given that the customer is a minor, it is quite possible that he/she has 0 savings and that the income/spending comes from the parents' wealth.
SpendingScore = 0
The age of the customer is 86
Given the high age and the other dynamics that come with it (health, needs, etc.), we can assume that this customer is not spending (at least for the last few years) and the 0 score is justified
Hence, not dropping these records.
# Pairwise scatter plots of the four features, laid out on a 3x3 subplot grid.
# Each entry is (subplot position, x-axis column, y-axis column).
scatter_panels = [
    (331, 'Age', 'Income'),
    (332, 'Age', 'SpendingScore'),
    (333, 'Age', 'Savings'),
    (334, 'Income', 'SpendingScore'),
    (335, 'Income', 'Savings'),
    (337, 'SpendingScore', 'Savings'),
]

plt.figure(figsize=(20, 15))
for position, x_col, y_col in scatter_panels:
    plt.subplot(position)
    plt.scatter(customerDataSet[x_col], customerDataSet[y_col])
    plt.title(f'{x_col} vs {y_col}')

# Explicit margins and inter-plot spacing (used here instead of tight_layout).
plt.subplots_adjust(
    left=0.1,
    bottom=0.1,
    right=0.9,
    top=0.9,
    wspace=0.4,
    hspace=0.4,
)
plt.show()
Inference from above plots:
The general trend seen is that with increasing age, the customers' spending decreases and their savings increase.
This is driven by age: typically the younger generation spends more and saves less, while older customers spend less and save more.
There are, however, exceptions to the general trend described above.
We do observe data where higher aged customers are spending more and higher income customers are saving less. These could be age and lifestyle related expenses like health, children education/marriage etc.
The higher the spending score, the lower the savings. There seems to be a strong (although inverse) correlation between these 2 features.
Hence plotting a heatmap to check the correlation score.
# Pairwise correlation between all features, rendered as an annotated heatmap.
sns.set(rc={'figure.figsize': (10, 5)})
corr = customerDataSet.corr()
sns.heatmap(
    corr,
    annot=True,        # write the coefficient inside each cell
    cmap='coolwarm',
    linewidths=1.9,
)
<AxesSubplot:>
As seen in the scatter plot and the heatmap, there is a high correlation between SpendingScore and Savings, and they are inversely related. We can generalize that a high spender has low savings and a low spender has high savings.
Hence the Savings feature is dropped from the dataset: the spending score appears to be the better metric for segmenting customers and deriving insights, so Savings is the one of the pair chosen to drop.
# Remove the Savings column; the remaining columns are the clustering features.
customerDataSet = customerDataSet.drop(columns='Savings')

# Standardise the features; X is the scaled array passed to the clustering steps.
scaler = StandardScaler()
X = scaler.fit_transform(customerDataSet)
A common heuristic for minpoints is twice the number of features.
Using this minpoints value, we can then calculate the nearest-neighbour distances and determine the eps value.
# Heuristic for DBSCAN's min_samples: twice the number of columns in the dataset.
n_features = customerDataSet.shape[1]
minpoints = 2 * n_features
print ('Minpoints = ', minpoints)
Minpoints = 6
# Deriving eps from the k-distance curve, with k = minpoints.
# kneighbors(X) is queried with the training points themselves, so column 0 is
# each point's zero distance to itself and the last column is the distance to
# the minpoints-th point counting the point itself -- the same convention
# DBSCAN uses for min_samples.
neigh = NearestNeighbors(n_neighbors=minpoints)
nbrs = neigh.fit(X)
distances, indices = nbrs.kneighbors(X)
# Plot the k-th neighbour distance (not column 1, which is only the single
# nearest neighbour), sorted in descending order so that the bend ("elbow")
# of the curve indicates a candidate eps.
distances = np.sort(distances[:, -1])[::-1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x1e606698070>]
As seen in the plot above, the initial strong dip is seen at 0.2. Hence setting the eps value as 0.2 for clustering
# Fit DBSCAN with the eps chosen from the distance plot and the heuristic min_samples.
epsValue = 0.2
dbscan = DBSCAN(min_samples=minpoints, eps=epsValue)
y = dbscan.fit(X).labels_

# Store each record's label on the dataset and list the distinct labels found.
customerDataSet['Cluster'] = y
customerDataSet['Cluster'].unique()
array([ 0, 1, 4, 3, 2, -1], dtype=int64)
We observe 6 distinct labels: 5 clusters (0 to 4) plus the label -1.
DBSCAN assigns the label -1 to noise points, so its presence confirms that there are outliers
# Number of points that DBSCAN labelled -1 (noise).
noise_mask = dbscan.labels_ == -1
outliers = int(noise_mask.sum())
print('Outlier Data Points: ', outliers)
Outlier Data Points: 29
# Keep only the records assigned to a cluster (label other than -1),
# then count the records in each cluster.
is_clustered = customerDataSet['Cluster'] != -1
clusteredDataSet = customerDataSet[is_clustered].copy()
clusteredDataSet.groupby(['Cluster']).count()
| Age | Income | SpendingScore | |
|---|---|---|---|
| Cluster | |||
| 0 | 154 | 154 | 154 |
| 1 | 141 | 141 | 141 |
| 2 | 120 | 120 | 120 |
| 3 | 19 | 19 | 19 |
| 4 | 42 | 42 | 42 |
The 29 outliers have been removed and we see the final set of 5 clusters with number of data points belonging to each cluster in the data above
# 3D scatter of the clustered records: Age (x), Spending Score (y), Income (z).
Scene = {
    'xaxis': {'title': 'Age'},
    'yaxis': {'title': 'Spending Score'},
    'zaxis': {'title': 'Income'},
}

# Each marker is coloured by the cluster label of its record.
labels = clusteredDataSet['Cluster']
marker_style = {
    'color': labels,
    'size': 10,
    'line': {'color': 'black', 'width': 10},
}

trace = go.Scatter3d(
    x=clusteredDataSet['Age'],
    y=clusteredDataSet['SpendingScore'],
    z=clusteredDataSet['Income'],
    mode='markers',
    marker=marker_style,
)
layout = go.Layout(
    margin={'l': 0, 'r': 0},
    scene=Scene,
    height=600,
    width=600,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.show()
# Hand-written summary of each cluster: its colour in the 3D plot and the
# observed Age / Spending Score / Income ranges.
summary_columns = ['Cluster #', 'Cluster Color', 'Age', 'Spending Score', 'Income']
summaryData = [
    [0, 'Blue', 'High (50-70)', 'High (0.6-0.9)', 'Average (50K-90K)'],
    [1, 'Violet', 'Very High (79+)', 'Low (0-0.4)', 'Low (12K-50K)'],
    [2, 'Pink', 'Average (25-45)', 'Low (0-0.4)', 'High (80K-120K)'],
    [3, 'Orange', 'Very High (80+)', 'Very Low (0-0.1)', 'Very High (110K+)'],
    [4, 'Yellow', 'Low (18-30)', 'Very High (0.8+)', 'Very High (120K+)'],
]
summaryDataFrame = pd.DataFrame(summaryData, columns=summary_columns)

print('\nClustering Summary: ')
print('---------------------')
summaryDataFrame
Clustering Summary: ---------------------
| Cluster # | Cluster Color | Age | Spending Score | Income | |
|---|---|---|---|---|---|
| 0 | 0 | Blue | High (50-70) | High (0.6-0.9) | Average (50K-90K) |
| 1 | 1 | Violet | Very High (79+) | Low (0-0.4) | Low (12K-50K) |
| 2 | 2 | Pink | Average (25-45) | Low (0-0.4) | High (80K-120K) |
| 3 | 3 | Orange | Very High (80+) | Very Low (0-0.1) | Very High (110K+) |
| 4 | 4 | Yellow | Low (18-30) | Very High (0.8+) | Very High (120K+) |
These are customers nearing retirement/recently retired. Their income and spending scores are average. These customers can be given offers on jewellery/gold that are medium to long term investment oriented, options to buy in smaller chunks and accumulate, goal oriented investment like kids marriage, education etc.
These are customers whose age is very high (super senior citizens) and whose income is low. Given their age, a need for jewellery as an ornament or as an investment (Gold/Silver etc.) is highly unlikely, and the income also does not support the purchase of jewellery. These customers can be considered for offers of the best rates on the sale of their existing jewellery, or low-interest-rate offers on jewellery/gold loans.
These are customers in the 25-45 years age range with a decent income. However, their spending score is low. Given their age, they might be investing in a house, funds, deposits etc. and probably purchase jewellery only on a need basis (marriage, special occasions etc.), hence the low spending score. These customers can be given offers on jewellery/gold that are long-term investment oriented, options to buy in smaller chunks and accumulate, and goal-oriented investments such as children's marriage, education etc.
These are customers whose age is very high (super senior citizens) and whose income is also very high. Given their high age, the high income could be the result of a steady business or good returns from investments made at an early age. Given the age and their very low spending score, there seems to be no need for purchasing jewellery as an ornament. However, considering these customers are business/investment oriented, they can be considered for tailored offers portraying jewellery/gold as a high-return investment option.
These are young customers with a very high income and a very high spending score as well. They could be young entrepreneurs or customers in high-paying jobs with niche skills. They have a high spending capacity and are comparatively an easier target segment here. They can be given various offers on jewellery both as an ornament and as an investment.
# Grid search over eps and min_samples: fit DBSCAN for every combination and
# record the number of clusters found and the silhouette score of the labelling.
eps_values = np.arange(0.1, 1.05, 0.05)
min_samples = np.arange(3, 12, 3)
dbscan_params = list(product(eps_values, min_samples))

no_of_clusters = []
sil_score = []
epsvalues = []
min_samp = []
for eps_candidate, min_samples_candidate in dbscan_params:
    dbscan_cluster = DBSCAN(eps=eps_candidate, min_samples=min_samples_candidate).fit(X)
    cluster_labels = dbscan_cluster.labels_
    unique_labels = np.unique(cluster_labels)

    epsvalues.append(eps_candidate)
    min_samp.append(min_samples_candidate)

    # -1 is DBSCAN's noise label, not a cluster, so it is left out of the count.
    no_of_clusters.append(len(unique_labels) - int(-1 in unique_labels))

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError otherwise (e.g. every point is noise, or everything
    # merges into a single cluster); record NaN for such fits instead of
    # aborting the whole search. As before, the score is computed over all
    # points, with noise points treated as one more label.
    if 2 <= len(unique_labels) <= len(cluster_labels) - 1:
        sil_score.append(silhouette_score(X, cluster_labels))
    else:
        sil_score.append(np.nan)

eps_min = list(zip(no_of_clusters, sil_score, epsvalues, min_samp))
eps_min_df = pd.DataFrame(
    eps_min,
    columns=['no_of_clusters', 'silhouette_score', 'epsilon_values', 'minimum_points'],
)
eps_min_df
| no_of_clusters | silhouette_score | epsilon_values | minimum_points | |
|---|---|---|---|---|
| 0 | 25 | -0.240218 | 0.10 | 3 |
| 1 | 7 | -0.354808 | 0.10 | 6 |
| 2 | 7 | -0.428807 | 0.10 | 9 |
| 3 | 10 | 0.592727 | 0.15 | 3 |
| 4 | 5 | 0.527624 | 0.15 | 6 |
| 5 | 5 | 0.376712 | 0.15 | 9 |
| 6 | 7 | 0.639052 | 0.20 | 3 |
| 7 | 6 | 0.745458 | 0.20 | 6 |
| 8 | 6 | 0.677330 | 0.20 | 9 |
| 9 | 6 | 0.796519 | 0.25 | 3 |
| 10 | 6 | 0.819692 | 0.25 | 6 |
| 11 | 6 | 0.799336 | 0.25 | 9 |
| 12 | 6 | 0.701452 | 0.30 | 3 |
| 13 | 6 | 0.701452 | 0.30 | 6 |
| 14 | 6 | 0.701452 | 0.30 | 9 |
| 15 | 5 | 0.834790 | 0.35 | 3 |
| 16 | 5 | 0.834790 | 0.35 | 6 |
| 17 | 5 | 0.834790 | 0.35 | 9 |
| 18 | 5 | 0.834790 | 0.40 | 3 |
| 19 | 5 | 0.834790 | 0.40 | 6 |
| 20 | 5 | 0.834790 | 0.40 | 9 |
| 21 | 5 | 0.834790 | 0.45 | 3 |
| 22 | 5 | 0.834790 | 0.45 | 6 |
| 23 | 5 | 0.834790 | 0.45 | 9 |
| 24 | 5 | 0.834790 | 0.50 | 3 |
| 25 | 5 | 0.834790 | 0.50 | 6 |
| 26 | 5 | 0.834790 | 0.50 | 9 |
| 27 | 5 | 0.834790 | 0.55 | 3 |
| 28 | 5 | 0.834790 | 0.55 | 6 |
| 29 | 5 | 0.834790 | 0.55 | 9 |
| 30 | 5 | 0.834790 | 0.60 | 3 |
| 31 | 5 | 0.834790 | 0.60 | 6 |
| 32 | 5 | 0.834790 | 0.60 | 9 |
| 33 | 5 | 0.834790 | 0.65 | 3 |
| 34 | 5 | 0.834790 | 0.65 | 6 |
| 35 | 5 | 0.834790 | 0.65 | 9 |
| 36 | 5 | 0.834790 | 0.70 | 3 |
| 37 | 5 | 0.834790 | 0.70 | 6 |
| 38 | 5 | 0.834790 | 0.70 | 9 |
| 39 | 5 | 0.834790 | 0.75 | 3 |
| 40 | 5 | 0.834790 | 0.75 | 6 |
| 41 | 5 | 0.834790 | 0.75 | 9 |
| 42 | 5 | 0.834790 | 0.80 | 3 |
| 43 | 5 | 0.834790 | 0.80 | 6 |
| 44 | 5 | 0.834790 | 0.80 | 9 |
| 45 | 5 | 0.834790 | 0.85 | 3 |
| 46 | 5 | 0.834790 | 0.85 | 6 |
| 47 | 5 | 0.834790 | 0.85 | 9 |
| 48 | 5 | 0.834790 | 0.90 | 3 |
| 49 | 5 | 0.834790 | 0.90 | 6 |
| 50 | 5 | 0.834790 | 0.90 | 9 |
| 51 | 5 | 0.834790 | 0.95 | 3 |
| 52 | 5 | 0.834790 | 0.95 | 6 |
| 53 | 5 | 0.834790 | 0.95 | 9 |
| 54 | 5 | 0.834790 | 1.00 | 3 |
| 55 | 5 | 0.834790 | 1.00 | 6 |
| 56 | 5 | 0.834790 | 1.00 | 9 |
As seen in the silhouette score table above, the best score is first reached at eps = 0.35 (for every minpoints value tried); minpoints = 6 is used here. Applying the DBSCAN algorithm with these parameters and verifying the clusters
# Re-run DBSCAN with the parameters picked from the silhouette score table.
epsValue = 0.35
minpoints = 6
dbscan = DBSCAN(min_samples=minpoints, eps=epsValue)
y = dbscan.fit(X).labels_

# Write the new labels onto a copy, leaving customerDataSet itself unchanged,
# then list the distinct labels found.
clusteredDataSet = customerDataSet.copy()
clusteredDataSet['Cluster'] = y
clusteredDataSet['Cluster'].unique()
array([0, 1, 2, 3, 4], dtype=int64)
# Number of points that DBSCAN labelled -1 (noise).
noise_mask = dbscan.labels_ == -1
outliers = int(noise_mask.sum())
print('Outlier Data Points: ', outliers)
Outlier Data Points: 0
We see there are no outliers here and all data points have been clustered
# Drop any record labelled -1 (noise), then count the records in each cluster.
is_clustered = clusteredDataSet['Cluster'] != -1
clusteredDataSet = clusteredDataSet[is_clustered]
clusteredDataSet.groupby(['Cluster']).count()
| Age | Income | SpendingScore | |
|---|---|---|---|
| Cluster | |||
| 0 | 157 | 157 | 157 |
| 1 | 147 | 147 | 147 |
| 2 | 50 | 50 | 50 |
| 3 | 25 | 25 | 25 |
| 4 | 126 | 126 | 126 |
# 3D scatter of the clustered records: Age (x), Spending Score (y), Income (z).
Scene = {
    'xaxis': {'title': 'Age'},
    'yaxis': {'title': 'Spending Score'},
    'zaxis': {'title': 'Income'},
}

# Each marker is coloured by the cluster label of its record.
labels = clusteredDataSet['Cluster']
marker_style = {
    'color': labels,
    'size': 10,
    'line': {'color': 'black', 'width': 10},
}

trace = go.Scatter3d(
    x=clusteredDataSet['Age'],
    y=clusteredDataSet['SpendingScore'],
    z=clusteredDataSet['Income'],
    mode='markers',
    marker=marker_style,
)
layout = go.Layout(
    margin={'l': 0, 'r': 0},
    scene=Scene,
    height=600,
    width=600,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.show()
In the approach of deriving minpoints and eps using the silhouette score, the key difference is that all data points were clustered and none were marked as outliers.
The final clusters look similar to what was observed previously with similar age, spending score and income ranges - we see only minor changes because of increase in eps and min points as compared to heuristics based approach.
Hence the cluster summary and indicative business proposals are also same as furnished earlier.